{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Leave one out spatial cross validation for SD1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load the libraries for calculation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import the libraries\n",
    "import ee\n",
    "import pandas as pd\n",
    "import os\n",
    "import numpy as np\n",
    "import random\n",
    "from random import sample\n",
    "import itertools \n",
    "import geopandas as gpd\n",
    "from sklearn.metrics import r2_score\n",
    "from termcolor import colored # this is allocate colour and fonts type for the print title and text\n",
    "from IPython.display import display, HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#set the working directory of local drive for Grid search result table loading\n",
    "# os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Intialize the ee API connection\n",
    "ee.Initialize()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Prepare the composite for calculation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the vectors of predictors\n",
    "predictorVector = ['Aridity_Index',\n",
    "               'CHELSA_Annual_Mean_Temperature',\n",
    "               'CHELSA_Annual_Precipitation',\n",
    "               'CHELSA_Isothermality',\n",
    "               'CHELSA_Max_Temperature_of_Warmest_Month',\n",
    "               'CHELSA_Mean_Diurnal_Range',\n",
    "               'CHELSA_Mean_Temperature_of_Coldest_Quarter',\n",
    "               'CHELSA_Mean_Temperature_of_Driest_Quarter',\n",
    "               'CHELSA_Mean_Temperature_of_Warmest_Quarter',\n",
    "               'CHELSA_Mean_Temperature_of_Wettest_Quarter',\n",
    "               'CHELSA_Min_Temperature_of_Coldest_Month',\n",
    "               'CHELSA_Precipitation_Seasonality',\n",
    "               'CHELSA_Precipitation_of_Coldest_Quarter',\n",
    "               'CHELSA_Precipitation_of_Driest_Month',\n",
    "               'CHELSA_Precipitation_of_Driest_Quarter',\n",
    "               'CHELSA_Precipitation_of_Warmest_Quarter',\n",
    "               'CHELSA_Precipitation_of_Wettest_Month',\n",
    "               'CHELSA_Precipitation_of_Wettest_Quarter',\n",
    "               'CHELSA_Temperature_Annual_Range',\n",
    "               'CHELSA_Temperature_Seasonality',\n",
    "               'Depth_to_Water_Table',\n",
    "               'EarthEnvTopoMed_Eastness',\n",
    "               'EarthEnvTopoMed_Elevation',\n",
    "               'EarthEnvTopoMed_Northness',\n",
    "               'EarthEnvTopoMed_ProfileCurvature',\n",
    "               'EarthEnvTopoMed_Roughness',\n",
    "               'EarthEnvTopoMed_Slope',\n",
    "               'SG_Absolute_depth_to_bedrock',\n",
    "               'WorldClim2_SolarRadiation_AnnualMean',\n",
    "               'WorldClim2_WindSpeed_AnnualMean',\n",
    "               'EarthEnvCloudCover_MODCF_interannualSD',\n",
    "               'EarthEnvCloudCover_MODCF_intraannualSD',\n",
    "               'EarthEnvCloudCover_MODCF_meanannual',\n",
    "               'EarthEnvTopoMed_AspectCosine',\n",
    "               'EarthEnvTopoMed_AspectSine',\n",
    "               'LandCoverClass_Cultivated_and_Managed_Vegetation',\n",
    "               'Human_Disturbance',\n",
    "               'LandCoverClass_Urban_Builtup',\n",
    "               'SG_Clay_Content_0_100cm',\n",
    "               'SG_Coarse_fragments_0_100cm',\n",
    "               'SG_Sand_Content_0_100cm',\n",
    "               'SG_Silt_Content_0_100cm',\n",
    "               'SG_Soil_pH_H2O_0_100cm',\n",
    "               'WDPA',\n",
    "               'cropland',\n",
    "               'grazing',\n",
    "               'pasture',\n",
    "               'rangeland',\n",
    "               'PresentTreeCover']\n",
    "# define the dependent variable\n",
    "varToModel = 'RemoteBiomass'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # load the biomass map \n",
    "# # transfer biomass to carbon stock with the factor 0.5\n",
    "# biomassDensityMap = ee.Image(\"users/leonidmoore/ForestBiomass/RemoteSensingModel/ESA_CCI_AGB_Map_bias_corrected_1km_2010\").multiply(0.5).rename('RemoteBiomass')\n",
    "# # load the hansen forest cover, and transform to 0-1\n",
    "# hansenTreeCover = ee.Image(\"projects/crowtherlab/Composite/CrowtherLab_Composite_30ArcSec\").select('HansenEtAl_TreeCover_Year2010').divide(100).rename('PresentTreeCover')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # read the composite\n",
    "# compositeImage = ee.Image(\"users/leonidmoore/ForestBiomass/20200915_Forest_Biomass_Predictors_Image\").addBands(biomassDensityMap).addBands(hansenTreeCover)\n",
    "# # show the band names of the composite image \n",
    "# print('Composite Band Names:',compositeImage.bandNames().getInfo())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Prepare train data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # load the full train data\n",
    "# fullRandomPoints = ee.FeatureCollection(\"users/leonidmoore/ForestBiomass/RemoteSensingModel/RandomPoints/Random_Points_Shp_100000\")\n",
    "# print(fullRandomPoints.size().getInfo())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Spatial cross validation for each Biome"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1 define the functions needed for spatial CV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define list contains the buffer sizes to test\n",
    "buffer_sizes = 900000 # 900km\n",
    "\n",
    "# define the core function for spatial cross validation\n",
    "#  Blocked Leave One Out cross-validation function:\n",
    "def BLOOcv(f):\n",
    "    rep = f.get('rep')\n",
    "    # Test feature\n",
    "    testFC = ee.FeatureCollection(f)\n",
    "\n",
    "    # Training set: all samples not within geometry of test feature\n",
    "    trainFC = perBootstrapTable.filter(ee.Filter.geometry(testFC).Not())\n",
    "\n",
    "    # Classifier to test\n",
    "    classifier = ee.Classifier.smileRandomForest(\n",
    "        numberOfTrees=200,\n",
    "        variablesPerSplit = variablesPerSplitVal,\n",
    "        minLeafPopulation = minLeafPopulationVal,\n",
    "        maxNodes = maxNodesVal,\n",
    "        bagFraction=0.632,\n",
    "        seed = seedVal).setOutputMode('REGRESSION')\n",
    "    \n",
    "    # define the Train classifier\n",
    "    trainedClassifer = classifier.train(trainFC, varToModel, predictorVector)\n",
    "    # Apply classifier to the feature collection\n",
    "    classified = testFC.classify(classifier = trainedClassifer,\n",
    "                                 outputName = 'predicted')\n",
    "    # Get predicted value\n",
    "    predicted = classified.first().get('predicted')\n",
    "    # return the predicted value for each feature\n",
    "    return f.set('predicted', predicted).copyProperties(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the R^2 claculation function\n",
    "def coefficientOfDetermination(fcOI,propertyOfInterest,propertyOfInterest_Predicted):\n",
    "    # Compute the mean of the property of interest\n",
    "    propertyOfInterestMean = ee.Number(ee.Dictionary(ee.FeatureCollection(fcOI).select([propertyOfInterest]).reduceColumns(ee.Reducer.mean(),[propertyOfInterest])).get('mean'));\n",
    "    # Compute the total sum of squares\n",
    "    def totalSoSFunction(f):\n",
    "        return f.set('Difference_Squared',ee.Number(ee.Feature(f).get(propertyOfInterest)).subtract(propertyOfInterestMean).pow(ee.Number(2)))\n",
    "    totalSumOfSquares = ee.Number(ee.Dictionary(ee.FeatureCollection(fcOI).map(totalSoSFunction).select(['Difference_Squared']).reduceColumns(ee.Reducer.sum(),['Difference_Squared'])).get('sum'))\n",
    "    # Compute the residual sum of squares\n",
    "    def residualSoSFunction(f):\n",
    "        return f.set('Residual_Squared',ee.Number(ee.Feature(f).get(propertyOfInterest)).subtract(ee.Number(ee.Feature(f).get(propertyOfInterest_Predicted))).pow(ee.Number(2)))\n",
    "    residualSumOfSquares = ee.Number(ee.Dictionary(ee.FeatureCollection(fcOI).map(residualSoSFunction).select(['Residual_Squared']).reduceColumns(ee.Reducer.sum(),['Residual_Squared'])).get('sum'))\n",
    "    # Finalize the calculation\n",
    "    r2 = ee.Number(1).subtract(residualSumOfSquares.divide(totalSumOfSquares))\n",
    "    return ee.Number(r2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define the R2 calc function \n",
    "def calc_final_r2(buffer_feat):\n",
    "    rep = buffer_feat.get('rep')\n",
    "    # Add buffer to FC of sampled observations\n",
    "    buffer = buffer_feat.get('buffer_size')\n",
    "    \n",
    "    # Sample 1000 validation points from the data\n",
    "    subsetData = perBootstrapTable.randomColumn(seed = rep).sort('random').limit(n_points)\n",
    "\n",
    "    fc_wBuffer = subsetData.map(lambda f: f.buffer(buffer))\n",
    "    fc_toValidate = fc_wBuffer.map(lambda f: f.set('rep', rep))\n",
    "    # Apply blocked leave one out CV function\n",
    "    predicted = fc_toValidate.map(BLOOcv)\n",
    "    # Calculate R2 value\n",
    "    R2_val = coefficientOfDetermination(predicted, varToModel, 'predicted')\n",
    "    return(buffer_feat.set('R2_val', R2_val))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define the PredObs calc function \n",
    "def calc_Pred_Obs(buffer_feat):\n",
    "    rep = buffer_feat.get('rep')\n",
    "    # Add buffer to FC of sampled observations\n",
    "    buffer = buffer_feat.get('buffer_size')\n",
    "    \n",
    "    # Sample 1000 validation points from the data\n",
    "    subsetData = perBootstrapTable.randomColumn(seed = rep).sort('random').limit(n_points)\n",
    "\n",
    "    # fc_wBuffer = subsetData.map(lambda f: f.buffer(buffer))\n",
    "    fc_toValidate = subsetData.map(lambda f: f.set('rep', rep))\n",
    "    # Apply blocked leave one out CV function\n",
    "    predicted = fc_toValidate.map(BLOOcv)\n",
    "    # Uncomment the lines below to export the predicted/observed data per buffer size\n",
    "    predObs = predicted.select([varToModel, 'predicted']).map(lambda f: f.set('rep', rep))\n",
    "    return(predObs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.2 calculate the spatial CV R2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m\u001b[34mThe seeds are:\u001b[0m [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]\n",
      "\u001b[1m\u001b[34mModel is running!\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# generate a ee.List to save the seeds\n",
    "seedList = np.arange(0, 100, 1).tolist()\n",
    "print(colored('The seeds are:', 'blue', attrs=['bold']),seedList)\n",
    "print(colored('Model is running!', 'blue', attrs=['bold']))\n",
    "for seed in seedList:\n",
    "    n_reps = 2\n",
    "    nList = list(range(0,n_reps))\n",
    "    n_points = 1000\n",
    "    #  define a feature collection to save the calcuation results\n",
    "    bloo_cv_fc = ee.FeatureCollection(ee.List(nList).map(lambda n: ee.Feature(ee.Geometry.Point([0,0])).set('buffer_size',buffer_sizes).set('rep',n)))\n",
    "    # load the train table\n",
    "    perBootstrapTable = ee.FeatureCollection('users/nordmannmoore/ForestBiomass/RemoteSensingModel/TrainTables/SD1_Random_Subsampled_Train_Table_seed_'+str(seed))\n",
    "    # print(trainTable.size().getInfo())\n",
    "    parameterTable = pd.read_csv('Data/SatelliteDerivedModel/GridSearchResult/SD1_Potential_Biomass_Modeling_Grid_Search_Seed_'+str(seed)+'.csv', float_precision='round_trip')\n",
    "    # extract the paramters\n",
    "    variablesPerSplitVal = int(parameterTable['variablesPerSplit'].iat[0]) # mtry\n",
    "    minLeafPopulationVal = int(parameterTable['minLeafPopulation'].iat[0]) # minrow\n",
    "    maxNodesVal = int(parameterTable['maxNodes'].iat[0]) # mac depth\n",
    "    seedVal = seed\n",
    "    # Calculate predObs across range of R2 values\n",
    "    final_fc = bloo_cv_fc.map(calc_final_r2)\n",
    "    #     rSquared_export = ee.batch.Export.table.toAsset(\n",
    "    #         collection = final_fc,\n",
    "    #         description = varToModel+'bloo_cv_RM_rSqured_'+str(buffer_sizes)+'m_'+str(seed),\n",
    "    #         assetId = 'users/leonidmoore/ForestBiomass/RemoteSensingModel/SpatialCrossValidation/Remote_Sensing_Spatial_CV_Rsquared_'+str(buffer_sizes)+'m_'+str(seed))\n",
    "    #     # start to the running\n",
    "    #rSquared_export.start()\n",
    "    predObs_10_Folds_CV_Export = ee.batch.Export.table.toCloudStorage(\n",
    "        collection = final_fc,\n",
    "        description = 'SD1_Leav_One_Out_Cross_Validation_rSquared_'+str(seed),\n",
    "        bucket = \"crowtherlab_gcsb_lidong\",\n",
    "        fileNamePrefix = 'LOOCV_Results/Model_SD1_Leav_One_Out_Cross_Validation_rSquared_'+str(seed),\n",
    "        fileFormat ='CSV')\n",
    "    \n",
    "    predObs_10_Folds_CV_Export.start()\n",
    "    predObs_10_Folds_CV_Export.status()\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.3 calculate the spatial CV predicted VS observed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m\u001b[34mThe seeds are:\u001b[0m [0]\n",
      "\u001b[1m\u001b[34mModel is running!\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# # generate a ee.List to save the seeds\n",
    "# seedList = np.arange(0, 1, 1).tolist()\n",
    "# print(colored('The seeds are:', 'blue', attrs=['bold']),seedList)\n",
    "# print(colored('Model is running!', 'blue', attrs=['bold']))\n",
    "# for seed in seedList:\n",
    "#     n_reps = 5\n",
    "#     nList = list(range(0,n_reps))\n",
    "#     n_points = 1000\n",
    "#     #  define a feature collection to save the calcuation results\n",
    "#     bloo_cv_fc = ee.FeatureCollection(ee.List(nList).map(lambda n: ee.Feature(ee.Geometry.Point([0,0])).set('buffer_size',buffer_sizes).set('rep',n)))\n",
    "#     # read the train table for each dataset\n",
    "#     perBootstrapTable = ee.FeatureCollection('users/leonidmoore/ForestBiomass/RemoteSensingModel/TrainTables/Remote_Sensing_Random_Subsampled_Train_Table_seed_'+str(seed))\n",
    "#     #load the parameter table\n",
    "#     parameterTable = pd.read_csv('RemoteSensingModel/GridSearchResult/Remote_Sensing_Biomass_Modeling_Grid_Search_Seed_'+str(seed)+'.csv', float_precision='round_trip')\n",
    "#     # extract the paramters\n",
    "#     variablesPerSplitVal = int(parameterTable['variablesPerSplit'].iat[0]) # mtry\n",
    "#     minLeafPopulationVal = int(parameterTable['minLeafPopulation'].iat[0]) # minrow\n",
    "#     maxNodesVal = int(parameterTable['maxNodes'].iat[0]) # mac depth\n",
    "#     seedVal = seed\n",
    "#     # Calculate predObs across range of R2 values\n",
    "#     final_PredObs = bloo_cv_fc.map(calc_Pred_Obs)\n",
    "#     # flatten the featureCollection of featureCollection for easy writing to google earth engine asset\n",
    "#     filteredData = final_PredObs.flatten()\n",
    "#     # define the exportation code\n",
    "#     predObs_export = ee.batch.Export.table.toAsset(\n",
    "#         collection = filteredData,\n",
    "#         description = varToModel+'_SD1_Spatial_Cross_Validation_PredObs_'+str(buffer_sizes)+'m_'+str(seed),\n",
    "#     assetId = 'users/leonidmoore/ForestBiomass/RemoteSensingModel/SpatialCrossValidation/Remote_Sensing_SD1_Spatial_CV_PredObs_'+str(buffer_sizes)+'m_'+str(seed))\n",
    "#     # start the exportation\n",
    "#     predObs_export.start()\n",
    "# #     leave_OneOut_CV_Export = ee.batch.Export.table.toDrive(\n",
    "# #         collection = filteredData,\n",
    "# #         description = 'Leav_One_Out_Cross_Validation_rSquared_'+str(seed),\n",
    "# #         fileNamePrefix = 'Remote_Sensing_Leav_One_Out_Cross_Validation_rSquared_'+str(seed)+'.csv',\n",
    "# #         fileFormat ='CSV')\n",
    "# #     leave_OneOut_CV_Export.start()\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
